{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from core import google_news_run\n",
    "import json\n",
    "import os\n",
    "import logging\n",
    "\n",
    "# Only ERROR and above should reach the notebook output.\n",
    "# basicConfig() is a no-op when the root logger already has handlers\n",
    "# (NOTE(review): importing `core` above may install one -- confirm), so\n",
    "# the level is also set on the root logger directly to make it stick.\n",
    "logging.basicConfig(\n",
    "    level = logging.ERROR\n",
    ")\n",
    "logging.getLogger().setLevel(logging.ERROR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Search topics; the crawl cell iterates over this list.\n",
    "#\n",
    "# Disabled alternative: read 'new.txt', split it on '\\n', and keep the\n",
    "# second '. '-separated field of every line:\n",
    "#     topics = [line.split('. ')[1] for line in lines]\n",
    "topics = [\n",
    "    'Perdana menteri',\n",
    "    'Menteri Kanan Perdagangan Antarabangsa dan Industri',\n",
    "    'Menteri Kanan Pertahanan',\n",
    "    'Kementerian Kewangan',\n",
    "    'Menteri Kanan Kerja Raya',\n",
    "    'Menteri Kanan Pendidikan',\n",
    "    'Menteri Pengangkutan',\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Crawl every topic and store its results as '<lowercased topic>.json' in\n",
    "# the current working directory. Keyword arguments are handed to\n",
    "# core.google_news_run unchanged.\n",
    "for topic in topics:\n",
    "    topic = topic.lower()\n",
    "    # topic = 'isu ' + topic\n",
    "    file = topic + '.json'\n",
    "\n",
    "    # A topic whose output file already exists is skipped, so an\n",
    "    # interrupted run can simply be restarted.\n",
    "    if os.path.exists(file):\n",
    "        print('passed: ', file)\n",
    "        continue\n",
    "\n",
    "    print('crawling', topic)\n",
    "    results = google_news_run(\n",
    "        topic,\n",
    "        limit = 100000,\n",
    "        year_start = 2000,\n",
    "        year_end = 2021,\n",
    "        debug = False,\n",
    "        sleep_time_every_ten_articles = 10\n",
    "    )\n",
    "\n",
    "    # Serialize before touching the disk, then publish with an atomic\n",
    "    # rename. A failed dump or an interrupted write must never leave an\n",
    "    # empty/partial '<topic>.json' behind, because the existence check\n",
    "    # above would treat it as finished on the next run.\n",
    "    payload = json.dumps(results)\n",
    "    partial = file + '.tmp'\n",
    "    with open(partial, 'w') as fopen:\n",
    "        fopen.write(payload)\n",
    "    os.replace(partial, file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Move the crawled JSON files into data/ and zip that directory.\n",
    "# -p: do not fail when data/ already exists (cell can be re-run).\n",
    "!mkdir -p data\n",
    "!mv *.json data\n",
    "!zip -r data.zip data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
